{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 爬虫基类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "\n",
    "class MyCrawler:\n",
    "    def __init__(self, filename):\n",
    "        self.filename = filename\n",
    "        self.headers =  {\n",
    "            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',\n",
    "        }\n",
    "    \n",
    "    def download(self, url):\n",
    "        r = requests.get(url, headers=self.headers)\n",
    "        return r.text\n",
    "    \n",
    "    def extract(self, content, pattern):\n",
    "        result = re.findall(pattern, content)\n",
    "        return result\n",
    "    \n",
    "    def save(self, info):\n",
    "        with open(self.filename, 'a', encoding='utf-8') as f:\n",
    "            for item in info:\n",
    "                f.write('|||'.join(item) + '\\n')\n",
    "    \n",
    "    def crawl(self, url, pattern, headers=None):\n",
    "        if headers:\n",
    "            self.headers.update(headers)\n",
    "        content = self.download(url)\n",
    "        info = self.extract(content, pattern)\n",
    "        self.save(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from lxml import html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting package metadata (current_repodata.json): ...working... done\n",
      "Solving environment: ...working... done\n",
      "\n",
      "# All requested packages already installed.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# %conda (unlike !conda) installs into the environment of the running kernel\n",
    "%conda install requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Crawler whose save() appends extracted records to douban.txt\n",
    "douban_crawler = MyCrawler('douban.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the Douban Books tag page; the percent-encoded tag is Chinese for neural network\n",
    "content = douban_crawler.download('https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the downloaded HTML into an lxml element tree that can be queried with XPath\n",
    "tree = html.fromstring(content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every <a> directly under an <h2>; on this page those are the book title links\n",
    "book_names = tree.xpath('//h2/a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'神经网络与深度学习'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Text of the first title link, with surrounding whitespace stripped\n",
    "book_names[0].text.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "xpath_str = "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['神经网络与深度学习',\n",
       " 'Python神经网络编程',\n",
       " 'Python深度学习',\n",
       " '深度学习入门',\n",
       " '深度学习的数学',\n",
       " '数字思维',\n",
       " '深入浅出图神经网络：GNN原理解析',\n",
       " 'Neural Networks and Deep Learning',\n",
       " '神经网络设计（原书第2版）',\n",
       " '神经网络在应用科学和工程中的应用',\n",
       " 'Make Your Own Neural Network',\n",
       " '图解深度学习与神经网络：从张量到TensorFlow实现',\n",
       " 'MATLAB神经网络43个案例分析',\n",
       " 'Neural Networks and Learning Machines',\n",
       " '神经网络原理(原书第2版)',\n",
       " '连接组：造就独一无二的你',\n",
       " '神经网络控制',\n",
       " 'Neural Networks and Statistical Learning',\n",
       " '意识的宇宙',\n",
       " 'Unsupervised Learning']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stripped text of every book title link on the page\n",
    "[title_link.text.strip() for title_link in tree.xpath(\"//h2/a\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['邱锡鹏 / 机械工业出版社 / 2020-4-10 / 149.00元',\n",
       " '[英]塔里克·拉希德（Tariq Rashid） / 林赐 / 人民邮电出版社 / 2018-4 / 69.00元',\n",
       " '[美] 弗朗索瓦•肖莱 / 张亮 / 人民邮电出版社 / 2018-8 / 119.00元',\n",
       " '[ 日］  斋藤康毅 / 陆宇杰 / 人民邮电出版社 / 2018-7 / 59.00元',\n",
       " '[日]涌井良幸、[日]涌井贞美 / 杨瑞龙 / 人民邮电出版社 / 2019-4 / 69.00元',\n",
       " '[葡] 阿林多•奥利维拉 / 胡小锐 / 中信出版社 / 2020-1-1 / 69.00',\n",
       " '刘忠雨\\u3000李彦霖\\u3000周洋\\u3000著 / 机械工业出版社 / 2019-12-25 / 89元',\n",
       " 'Michael Nielsen / 2016-1',\n",
       " 'Martin T. Hagan、Howard B. Demuth、Mark H. Beale / 章毅 / 机械工业出版社 / 2017-11 / 99.00元',\n",
       " '萨马拉辛荷 / 2010-1 / 88.00元',\n",
       " 'Tariq Rashid / CreateSpace Independent Publishing Platform / 2016-3-31 / USD 45.00',\n",
       " '张平 / 电子工业出版社 / 2018-10 / 79.00元',\n",
       " '王小川、史峰、郁磊、李洋 / 北京航空航天大学出版社 / 2013-8-1 / CNY 48.00',\n",
       " 'Simon O. Haykin / Pearson / 2008-11-28 / USD 252.40',\n",
       " 'Simon Haykin / 叶世伟、史忠植 / 机械工业出版社 / 2004-1 / 69.00元',\n",
       " '[美] 承现峻 / 孙天齐 / 清华大学出版社 / 2016-1 / 45',\n",
       " '徐丽娜 / 2009-7 / 28.00元',\n",
       " 'Ke-Lin Du、M. N. S. Swamy / Springer / 2013-12-7 / USD 129.00',\n",
       " '[美] 杰拉尔德·埃德尔曼、[美] 朱利欧·托诺尼 / 顾凡及 / 上海科学技术出版社 / 2004-1 / 27.00元',\n",
       " 'A Bradford Book / 1999-6-11 / USD 40.00']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stripped text of every publication line (div with class pub)\n",
    "[pub_div.text.strip() for pub_div in tree.xpath(\"//div[@class='pub']\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['本书主要介绍神经网络与深度学习中的基础知识、主要模型（卷积神经网络、递归神经网络等）以及在计算机视觉、自然语言处理等领域的应用。',\n",
       " '神经网络是一种模拟人脑的神经网络，以期能够实现类人工智能的机器学习\\n技术。\\n本书揭示神经网络背后的概念，并介绍如何通过Python实现神经网络。全书\\n分为3...',\n",
       " '本书由Keras之父、现任Google人工智能研究员的弗朗索瓦•肖莱（François Chollet）执笔，详尽介绍了用Python和Keras进行深度学...',\n",
       " '本书是深度学习真正意义上的入门书，深入浅出地剖析了深度学习的原理和相关技术。书中使用Python3，尽量不依赖外部库或工具，从基本的数学知识出发，带领读者从...',\n",
       " '《深度学习的数学》基于丰富的图示和具体示例，通俗易懂地介绍了深度学习相关的数学知识。第1章介绍神经网络的概况；第2章介绍理解神经网络所需的数学基础知识；第3...',\n",
       " '计算机、细胞和大脑有什么共同之处？计算机是人类设计的电子设备，细胞是经自然进化和选择产生的生物实体，大脑是人类思维的创造者和“容器”。但在某种程度上，它们都...',\n",
       " '这是一本从原理、算法、实现、应用4个维度详细讲解图神经网络的著作，在图神经网络领域具有重大的意义。\\n本书作者是图神经网络领域的资深技术专家，作者所在的公司极...',\n",
       " 'http://neuralnetworksanddeeplearning.com/',\n",
       " '本书是一本易学易懂的神经网络教材，主要讨论网络结构、学习规则、训练技巧和工程应用，紧紧围绕“设计”这一视角组织材料和展开讲解，强调基本原理和训练方法，概念清...',\n",
       " '《神经网络在应用科学与工程中的应用:从基本原理到复杂的模式识别》为读者提供了神经网络方面简单但却系统的介绍。\\n《神经网络在应用科学和工程中的应用从基本原理到...',\n",
       " '《图解深度学习与神经网络：从张量到TensorFlow实现》是以TensorFlow 为工具介绍神经网络和深度学习的入门书，内容循序渐进，以简单示例和图例的...',\n",
       " 'For graduate-level neural network courses offered in the departments of Comput...',\n",
       " '★《华尔街日报》2012年度十佳非虚构图书\\n★亚马逊网站2012年编辑选择之百佳图书\\n★《出版人周刊》2012年春季十佳科学类图书\\n【内容简介】\\n基因组让你...',\n",
       " '神经网络控制已发展成为“智能控制”的一个新的分支，属先进控制技术，为解决复杂的非线性、不确定、不确知系统的控制问题，开辟了一条新的途径。《神经网络控制(第3...',\n",
       " '本书对意识理论进行全面研究，建立在近代神经科学基础上、致力于对意识的产生、及人们对意识的认识如何帮助其“把严格的科学描述与人类知识和经验的宽广领域联系起来”...',\n",
       " 'Since its founding in 1989 by Terrence Sejnowski, Neural Computation has becom...']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stripped text of every intro paragraph (p directly under a div with class info)\n",
    "[intro_p.text.strip() for intro_p in tree.xpath(\"//div[@class='info']/p\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[1;31mInit signature:\u001b[0m \u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m/\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
       "\u001b[1;31mDocstring:\u001b[0m     \n",
       "map(func, *iterables) --> map object\n",
       "\n",
       "Make an iterator that computes the function using arguments from\n",
       "each of the iterables.  Stops when the shortest iterable is exhausted.\n",
       "\u001b[1;31mType:\u001b[0m           type\n",
       "\u001b[1;31mSubclasses:\u001b[0m     \n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Leftover exploration: evaluating the bare name only displays the built-in map type.\n",
    "# NOTE(review): the saved output looks like help text from `map?` - confirm, and consider deleting this cell.\n",
    "map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One <li class='subject-item'> element per book entry on the page\n",
    "book_infos = tree.xpath(\"//li[@class='subject-item']\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "神经网络与深度学习 https://book.douban.com/subject/35044046/ 邱锡鹏 / 机械工业出版社 / 2020-4-10 / 149.00元 \n",
      " 本书主要介绍神经网络与深度学习中的基础知识、主要模型（卷积神经网络、递归神经网络等）以及在计算机视觉、自然语言处理等领域的应用。\n",
      "Python神经网络编程 https://book.douban.com/subject/30192800/ [英]塔里克·拉希德（Tariq Rashid） / 林赐 / 人民邮电出版社 / 2018-4 / 69.00元 \n",
      " 神经网络是一种模拟人脑的神经网络，以期能够实现类人工智能的机器学习\n",
      "技术。\n",
      "本书揭示神经网络背后的概念，并介绍如何通过Python实现神经网络。全书\n",
      "分为3...\n",
      "Python深度学习 https://book.douban.com/subject/30293801/ [美] 弗朗索瓦•肖莱 / 张亮 / 人民邮电出版社 / 2018-8 / 119.00元 \n",
      " 本书由Keras之父、现任Google人工智能研究员的弗朗索瓦•肖莱（François Chollet）执笔，详尽介绍了用Python和Keras进行深度学...\n",
      "深度学习入门 https://book.douban.com/subject/30270959/ [ 日］  斋藤康毅 / 陆宇杰 / 人民邮电出版社 / 2018-7 / 59.00元 \n",
      " 本书是深度学习真正意义上的入门书，深入浅出地剖析了深度学习的原理和相关技术。书中使用Python3，尽量不依赖外部库或工具，从基本的数学知识出发，带领读者从...\n",
      "深度学习的数学 https://book.douban.com/subject/33414479/ [日]涌井良幸、[日]涌井贞美 / 杨瑞龙 / 人民邮电出版社 / 2019-4 / 69.00元 \n",
      " 《深度学习的数学》基于丰富的图示和具体示例，通俗易懂地介绍了深度学习相关的数学知识。第1章介绍神经网络的概况；第2章介绍理解神经网络所需的数学基础知识；第3...\n",
      "数字思维 https://book.douban.com/subject/34941715/ [葡] 阿林多•奥利维拉 / 胡小锐 / 中信出版社 / 2020-1-1 / 69.00 \n",
      " 计算机、细胞和大脑有什么共同之处？计算机是人类设计的电子设备，细胞是经自然进化和选择产生的生物实体，大脑是人类思维的创造者和“容器”。但在某种程度上，它们都...\n",
      "深入浅出图神经网络：GNN原理解析 https://book.douban.com/subject/34927262/ 刘忠雨　李彦霖　周洋　著 / 机械工业出版社 / 2019-12-25 / 89元 \n",
      " 这是一本从原理、算法、实现、应用4个维度详细讲解图神经网络的著作，在图神经网络领域具有重大的意义。\n",
      "本书作者是图神经网络领域的资深技术专家，作者所在的公司极...\n",
      "Neural Networks and Deep Learning https://book.douban.com/subject/26727997/ Michael Nielsen / 2016-1 \n",
      " http://neuralnetworksanddeeplearning.com/\n",
      "神经网络设计（原书第2版） https://book.douban.com/subject/30236893/ Martin T. Hagan、Howard B. Demuth、Mark H. Beale / 章毅 / 机械工业出版社 / 2017-11 / 99.00元 \n",
      " 本书是一本易学易懂的神经网络教材，主要讨论网络结构、学习规则、训练技巧和工程应用，紧紧围绕“设计”这一视角组织材料和展开讲解，强调基本原理和训练方法，概念清...\n",
      "神经网络在应用科学和工程中的应用 https://book.douban.com/subject/4146246/ 萨马拉辛荷 / 2010-1 / 88.00元 \n",
      " 《神经网络在应用科学与工程中的应用:从基本原理到复杂的模式识别》为读者提供了神经网络方面简单但却系统的介绍。\n",
      "《神经网络在应用科学和工程中的应用从基本原理到...\n",
      "Make Your Own Neural Network https://book.douban.com/subject/26945232/ Tariq Rashid / CreateSpace Independent Publishing Platform / 2016-3-31 / USD 45.00 \n",
      " N/A\n",
      "图解深度学习与神经网络：从张量到TensorFlow实现 https://book.douban.com/subject/30333961/ 张平 / 电子工业出版社 / 2018-10 / 79.00元 \n",
      " 《图解深度学习与神经网络：从张量到TensorFlow实现》是以TensorFlow 为工具介绍神经网络和深度学习的入门书，内容循序渐进，以简单示例和图例的...\n",
      "MATLAB神经网络43个案例分析 https://book.douban.com/subject/26388161/ 王小川、史峰、郁磊、李洋 / 北京航空航天大学出版社 / 2013-8-1 / CNY 48.00 \n",
      " N/A\n",
      "Neural Networks and Learning Machines https://book.douban.com/subject/2584657/ Simon O. Haykin / Pearson / 2008-11-28 / USD 252.40 \n",
      " For graduate-level neural network courses offered in the departments of Comput...\n",
      "神经网络原理(原书第2版) https://book.douban.com/subject/1138922/ Simon Haykin / 叶世伟、史忠植 / 机械工业出版社 / 2004-1 / 69.00元 \n",
      " N/A\n",
      "连接组：造就独一无二的你 https://book.douban.com/subject/26666358/ [美] 承现峻 / 孙天齐 / 清华大学出版社 / 2016-1 / 45 \n",
      " ★《华尔街日报》2012年度十佳非虚构图书\n",
      "★亚马逊网站2012年编辑选择之百佳图书\n",
      "★《出版人周刊》2012年春季十佳科学类图书\n",
      "【内容简介】\n",
      "基因组让你...\n",
      "神经网络控制 https://book.douban.com/subject/3890040/ 徐丽娜 / 2009-7 / 28.00元 \n",
      " 神经网络控制已发展成为“智能控制”的一个新的分支，属先进控制技术，为解决复杂的非线性、不确定、不确知系统的控制问题，开辟了一条新的途径。《神经网络控制(第3...\n",
      "Neural Networks and Statistical Learning https://book.douban.com/subject/26422529/ Ke-Lin Du、M. N. S. Swamy / Springer / 2013-12-7 / USD 129.00 \n",
      " N/A\n",
      "意识的宇宙 https://book.douban.com/subject/1159821/ [美] 杰拉尔德·埃德尔曼、[美] 朱利欧·托诺尼 / 顾凡及 / 上海科学技术出版社 / 2004-1 / 27.00元 \n",
      " 本书对意识理论进行全面研究，建立在近代神经科学基础上、致力于对意识的产生、及人们对意识的认识如何帮助其“把严格的科学描述与人类知识和经验的宽广领域联系起来”...\n",
      "Unsupervised Learning https://book.douban.com/subject/6529821/ A Bradford Book / 1999-6-11 / USD 40.00 \n",
      " Since its founding in 1989 by Terrence Sejnowski, Neural Computation has becom...\n"
     ]
    }
   ],
   "source": [
    "# Print title, detail-page URL, publication line and intro for every book on the page\n",
    "for book_info in book_infos:\n",
    "    book_name_elem = book_info.xpath('.//h2/a')[0]\n",
    "    # lxml leaves .text as None when an element has no leading text, so fall back to ''\n",
    "    book_name = (book_name_elem.text or '').strip()\n",
    "    book_url = book_name_elem.attrib['href']\n",
    "    book_pub_elem = book_info.xpath(\".//div[@class='pub']\")\n",
    "    book_pub_info = (book_pub_elem[0].text or '').strip() if book_pub_elem else 'N/A'\n",
    "    # Some entries have no intro paragraph; those are shown as N/A\n",
    "    book_intro_elem = book_info.xpath(\".//div[@class='info']/p\")\n",
    "    book_intro = (book_intro_elem[0].text or '').strip() if book_intro_elem else 'N/A'\n",
    "    print(book_name, book_url, book_pub_info, '\\n', book_intro)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://book.douban.com/subject/35044046/'"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# href of the title link left in book_name_elem by the most recent iteration of the book loop\n",
    "book_name_elem.attrib['href']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
